We downloaded the data on grace with the SRA Toolkit, using the scripts generated by the following code. The SRA Toolkit was downloaded from GitHub and installed on grace with its setup script. The downloaded files were not compressed, so we also gzipped each file into a fastq.gz file.
# Generate the grace download commands: for every SRR accession, prefetch the
# .sra archive, dump it into split fastq files, and gzip the results.
id.list<-readLines("/Users/yanxiting/Downloads/SRR_Acc_List(2).txt")
# prefetch with a huge max-size so no run is skipped
temp1<-paste0("/home/xy48/scratch_palmer/sratoolkit/sratoolkit.3.0.0-ubuntu64/bin/prefetch.3.0.0 -X 9999999999999 ",id.list,"\n")
# split-files gives one fastq per read (I1/R1/R2)
temp2<-paste0("/home/xy48/scratch_palmer/sratoolkit/sratoolkit.3.0.0-ubuntu64/bin/fastq-dump.3.0.0 --split-files ./",id.list,".sra\n")
temp3<-rep("cd /home/xy48/scratch_palmer/SARC_10x/test/sra\n",length(temp1))
temp4<-paste0("gzip ./",id.list,"_*.fastq\n")
temp5<-rep("ssh transfer\n",length(temp1))
cmd.out<-cbind(temp5,temp1,temp3,temp2,temp4)
# element-wise concatenation is equivalent to the row-wise apply/paste pattern
temp.cmd<-paste0(temp5,temp1,temp3,temp2,temp4)
cat(temp.cmd,file="./download_commands.txt",append=F,sep="")
The cell ranger pipeline recognizes fastq file names formatted as [Sample Name]_S1_L00[Lane Number]_[Read Type]_001.fastq.gz. There are 3 files per sample representing I1, R1 and R2 downloaded by sratools, which were named as *_1, *_2 and *_3.fastq files. To correctly run cell ranger on these files, we need to change the fastq.gz names.
# this was run on grace.
# Rename SRA-dumped fastq files (<run>_1/2/3.fastq.gz) to the
# <run>_S1_L001_<ReadType>_001.fastq.gz layout that cellranger expects.
source("~/Rprogram/my_functions.R")
data.dir<-"/home/xy48/scratch_palmer/SARC_10x/test/sra"
filenames<-list.files(data.dir)
# keep only *.gz files (third dot-separated element is "gz")
temp<-sapply(filenames,my.element.extract,splitchar="\\.",index=3)
filenames<-filenames[temp=="gz"]
filenames<-filenames[!is.na(filenames)]
# read-index -> cellranger read type; SRR9* runs were dumped in a different
# order (1=I1,2=R1,3=R2) than the remaining runs (1=R1,2=R2,3=I1)
read.map.srr9<-c("1"="I1","2"="R1","3"="R2")
read.map.other<-c("1"="R1","2"="R2","3"="I1")
for(i in seq_along(filenames)){
  from.filename<-filenames[i]
  temp<-my.element.extract(from.filename,splitchar="\\.",index=1)
  temp<-my.element.extract(temp,splitchar="_",index=2)
  read.map<-if(substr(from.filename,1,4)=="SRR9") read.map.srr9 else read.map.other
  # skip files whose read index is not 1/2/3 instead of silently reusing the
  # previous iteration's to.filename (latent bug in the original loop)
  if(!temp%in%names(read.map)) next
  to.filename<-paste0(my.element.extract(from.filename,splitchar="_",index=1),
                      "_S1_L001_",read.map[[temp]],"_001.fastq.gz")
  file.rename(from=file.path(data.dir,from.filename),to=file.path(data.dir,to.filename))
  cat("from=",file.path(data.dir,from.filename),"\n","to=",file.path(data.dir,to.filename),"\n",sep="")
}
We rsync the fastq.gz files onto farnam under ~/scratch60/SARC_10X/fastq and run cell ranger on the files using commands generated by the following codes.
# Generate one SLURM batch script per SRR run that executes `cellranger count`
# against the downloaded fastq files.
id.list<-readLines("/Users/yanxiting/Downloads/SRR_Acc_List(2).txt")
# SBATCH header: 24h, 8 CPUs, 80GB, log files named after the run accession
temp1<-paste0("#!/bin/bash\n#SBATCH --time=24:00:00 --ntasks=1 --partition=general --cpus-per-task=8 --mem=80GB --job-name=",id.list," -o /home/xy48/scratch60/SARC_10X/cellranger_scripts/",id.list,".sh.o%J -e /home/xy48/scratch60/SARC_10X/cellranger_scripts/",id.list,".sh.e%J\n")
# identical module-load preamble for every run
temp2<-rep("module load cellranger/5.0.0\nmodule load bcl2fastq2/2-20-0-foss-2018b\n",length(id.list))
temp3<-"cd /home/xy48/scratch60/SARC_10X/cellranger_results\n"
temp4<-paste0("cellranger count --id=",id.list," --fastqs=/home/xy48/scratch60/SARC_10X/fastq --transcriptome=/home/xy48/scratch60/SARC_10X/refdata-gex-GRCh38-2020-A --localcores=8 --localmem=60 --sample=",id.list,"\n")
cmd.out<-cbind(temp1,temp2,temp3,temp4)
# element-wise concatenation; equivalent to the row-wise apply/paste pattern
temp.cmd<-paste0(temp1,temp2,temp3,temp4)
cat(temp.cmd,file="./cellranger_scripts.txt",append=F,sep="")There are samples with multiple runs. We run cellranger aggr to merge the multiple runs into one single nUMI vector.
# Identify samples sequenced in multiple runs and map each run to the fastq
# file-list it came from, so those runs can be merged with cellranger aggr.
home.dir<-"/home/yanxiting/driver_Farnam"
# NOTE(review): the original called source(file.path(home.dir,...)) BEFORE
# assigning home.dir; it is now defined first.
source(file.path(home.dir,"Rprogram/my_functions.R"))
library(xlsx)
runtable.filepath<-file.path(home.dir,"scratch60/SARC_10X/SraRunTable.txt")
id.filepath<-file.path(home.dir,"scratch60/SARC_10X/scRNA_ids.xlsx")
run.table<-read.table(runtable.filepath,sep="\t",header=T,check.names = F)
id.table<-read.xlsx(id.filepath,sheetIndex = 1,check.names=F)
# keep only runs whose GSM accession appears in the study's sample list,
# then group run IDs (column 1) by GSM accession
temp.table<-run.table[run.table$`GEO_Accession (exp)`%in%as.matrix(id.table)[,1],]
temp.list<-split(as.matrix(temp.table)[,1],temp.table$`GEO_Accession (exp)`)
# extract the samples with multiple runs
temp.list<-temp.list[lengths(temp.list)>1]
# find out the fastq part these IDs belong to
temp.filenames<-list.files(file.path(home.dir,"scratch60/SARC_10X"))
temp.filenames<-temp.filenames[grep(".filelist",temp.filenames)]  # note: unescaped regex dot
temp.filenames<-temp.filenames[grep("fastq",temp.filenames)]
file.list<-vector("list",length(temp.filenames))
for(i in seq_along(temp.filenames)){
  temp<-readLines(file.path(home.dir,"scratch60/SARC_10X",temp.filenames[i]))
  # index=-1 / index=1: presumably basename then run accession -- per my_functions.R
  temp<-unname(sapply(temp,my.element.extract,splitchar="/",index=-1))
  temp<-unname(sapply(temp,my.element.extract,splitchar="_",index=1))
  file.list[[i]]<-unique(temp)
}
names(file.list)<-temp.filenames
# two-column matrix: run ID -> file-list it was found in
# (rep() by element lengths replaces the original grow-by-c() loop)
temp<-rep(names(file.list),times=lengths(file.list))
file.list.matrix<-cbind(unlist(file.list),temp)
temp<-list()
for(i in seq_along(temp.list)){
  temp[[i]]<-unique(file.list.matrix[file.list.matrix[,1]%in%temp.list[[i]],2])
}
# transfer the cellranger results back to farnam
# Generate one cellranger-aggr CSV per multi-run sample: a header row plus one
# (library_id, molecule_h5) row per run. Paths are hard-coded to the cluster
# account because the CSV is consumed on the cluster, not on this mount.
cellranger.dir<-file.path(home.dir,"scratch60/SARC_10X/cellranger_results")
output.dir<-file.path(home.dir,"scratch60/SARC_10X/cellranger_aggr_scripts")
for(i in seq_along(temp.list)){
  output.filepath<-file.path(output.dir,paste0(names(temp.list)[i],"_aggr.csv"))
  # pipeline before cellranger 6.0 uses library_id; 6.0+ uses sample_id
  cmd.out<-rbind(c("library_id","molecule_h5"),
                 cbind(temp.list[[i]],
                       paste0("/home/xy48/scratch60/SARC_10X/cellranger_results/",temp.list[[i]],"/outs/molecule_info.h5")))
  write.table(cmd.out,file=output.filepath,row.names=F,col.names=F,sep=",",append=F,quote=F)
}
# generate the sh file to run cellranger aggr on the replicated runs of the same sample.
script.dir<-file.path(home.dir,"scratch60/SARC_10X/cellranger_aggr_scripts")
result.dir<-file.path(home.dir,"scratch60/SARC_10X/cellranger_aggr_results")
for(i in seq_along(temp.list)){
  sample.id<-names(temp.list)[i]
  csv.filepath<-file.path(script.dir,paste0(sample.id,"_aggr.csv"))
  script.filepath<-file.path(script.dir,paste0(sample.id,".sh"))
  # one paste0 builds the whole script: SBATCH header, module loads, cd, aggr
  cmd.out<-paste0(
    "#!/bin/bash\n#SBATCH --time=24:00:00 --ntasks=1 --partition=general --cpus-per-task=1 --mem=80GB --job-name=",sample.id," -o /home/xy48/scratch60/SARC_10X/cellranger_aggr_scripts/",sample.id,".sh.o%J -e /home/xy48/scratch60/SARC_10X/cellranger_aggr_scripts/",sample.id,".sh.e%J\n",
    "module load cellranger/5.0.0\nmodule load bcl2fastq2/2-20-0-foss-2018b\n",
    "cd /home/xy48/scratch60/SARC_10X/cellranger_aggr_results\n",
    "cellranger aggr --id=",sample.id," --csv=/home/xy48/scratch60/SARC_10X/cellranger_aggr_scripts/",sample.id,"_aggr.csv\n")
  cat(cmd.out,file=script.filepath,append=F)
}
We moved the cellranger aggr results back to cellranger_results together with the samples with unique run.
We extract the nUMI vector for the samples included in the original paper.
# Set up the grace-side session for extracting nUMI matrices.
home.dir<-"/home/yanxiting/driver_Grace"
source(file.path(home.dir,"Rprogram/my_functions.R"))
library(xlsx)
# SRA run metadata and the GSM IDs of the samples analysed in the paper
runtable.filepath<-file.path(home.dir,"scratch_palmer/SARC_10x/SraRunTable.txt")
id.filepath<-file.path(home.dir,"scratch_palmer/SARC_10x/scRNA_ids.xlsx")
run.table<-read.table(runtable.filepath,sep="\t",header=T,check.names = F)
id.table<-read.xlsx(id.filepath,sheetIndex = 1,check.names=F)WARNING: An illegal reflective access operation has occurred
WARNING: Illegal reflective access by org.apache.poi.util.SAXHelper (file:/home/yanxiting/MyTools/R4.1.1/R-4.1.1/library/xlsxjars/java/poi-ooxml-3.10.1-20140818.jar) to method com.sun.org.apache.xerces.internal.util.SecurityManager.setEntityExpansionLimit(int)
WARNING: Please consider reporting this to the maintainers of org.apache.poi.util.SAXHelper
WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations
WARNING: All illegal access operations will be denied in a future release
# Keep only runs for samples in the study, then load every cellranger result
# with Read10X and relabel the cell barcodes by sample name.
my.table<-run.table[run.table$`GEO_Accession (exp)`%in%as.matrix(id.table)[,1],]
filenames<-list.files(file.path(home.dir,"scratch_palmer/SARC_10x/cellranger_results"))
# use Read10x to load in the data
# Aggregated (GSM*) results keep their matrix under outs/count/, single-run
# (SRR*) results under outs/ directly; GSM directories are already named by
# sample, SRR directories are translated via the run table.
dir.list<-rep("",length(filenames))
sample.names<-character(length(filenames))  # preallocated (was grown with c())
for(i in seq_along(filenames)){
  if(substr(filenames[i],1,1)=="G"){
    dir.list[i]<-file.path(home.dir,"scratch_palmer/SARC_10x/cellranger_results",filenames[i],"outs","count","filtered_feature_bc_matrix")
    sample.names[i]<-filenames[i]
  }else{
    dir.list[i]<-file.path(home.dir,"scratch_palmer/SARC_10x/cellranger_results",filenames[i],"outs","filtered_feature_bc_matrix")
    # assumes each SRR run maps to exactly one Sample Name -- TODO confirm
    sample.names[i]<-as.matrix(my.table)[my.table$Run==filenames[i],"Sample Name"]
  }
}
# duplicated gene symbols will be changed to .1, .2, and so on.
temp<-Read10X(data.dir=dir.list,gene.column=2,cell.column=1)
# Read10X prefixes barcodes with the directory index ("1_", "2_", ...);
# replace that index with the corresponding sample name
temp.num<-as.numeric(sapply(colnames(temp),my.element.extract,splitchar="_",index=1))
temp1<-sample.names[temp.num]
temp2<-sapply(colnames(temp),my.element.extract,splitchar="_",index=-1)
colnames(temp)<-paste(temp1,"_",temp2,sep="")
output.filepath<-"/home/yanxiting/driver_Grace/scratch_palmer/SARC_10x/merged_nUMI_45_genenames_Read10X.rds"
saveRDS(temp,file=output.filepath,refhook=NULL)
#gcql1<-CreateSeuratObject(counts=temp)
# Re-build the merged matrix by reading each sample's matrix-market files
# directly; gene symbols (features column V2) become rownames and every sample
# is re-indexed to the gene order of the first one.
mat.list<-vector("list",length(filenames))
for(i in seq_along(filenames)){
  if(substr(filenames[i],1,1)=="G"){
    matrix.dir<-file.path(home.dir,"scratch_palmer/SARC_10x/cellranger_results",filenames[i],"outs","count","filtered_feature_bc_matrix")
  }else{
    matrix.dir<-file.path(home.dir,"scratch_palmer/SARC_10x/cellranger_results",filenames[i],"outs","filtered_feature_bc_matrix")
  }
  mat <- readMM(file = file.path(matrix.dir, "matrix.mtx.gz"))
  feature.names = read.delim(file.path(matrix.dir, "features.tsv.gz"), header = FALSE, stringsAsFactors = FALSE)
  barcode.names = read.delim(file.path(matrix.dir, "barcodes.tsv.gz"), header = FALSE, stringsAsFactors = FALSE)
  if(substr(filenames[i],1,1)=="G"){
    sample.name<-filenames[i]
  }else{
    sample.name<-as.matrix(my.table)[my.table$Run==filenames[i],"Sample Name"]
  }
  colnames(mat) = paste0(sample.name,"_",barcode.names$V1)
  rownames(mat) = feature.names$V2
  if(i==1){
    gene.names<-rownames(mat)
    mat.list[[i]]<-mat
  }else{
    mat.list[[i]]<-mat[gene.names,]  # align to the first sample's gene order
  }
}
# single cbind instead of growing merged.data inside the loop (quadratic copies)
merged.data<-do.call(cbind,mat.list)
output.filepath<-"/home/yanxiting/driver_Grace/scratch_palmer/SARC_10x/merged_nUMI_45_genenames.rds"
saveRDS(merged.data,file=output.filepath,refhook=NULL)
We transferred the cellranger results back to farnam to extract the nUMI results. We also add the samples with multiple runs into the merged matrix.
# samples with multiple runs have names of the GEO accession number.
# Save one nUMI matrix per single-run (SRR1*) cellranger result on farnam,
# named <sample>_<run>_nUMI.rds.
source("/home/yanxiting/driver_Farnam/Rprogram/my_functions.R")
library(Matrix)
cellranger.dir<-"/home/yanxiting/driver_Farnam/scratch60/SARC_10X/cellranger_results"
runtable.filepath<-"/home/yanxiting/driver_Farnam/scratch60/SARC_10X/SraRunTable.txt"
# load in the run table
run.table<-read.table(runtable.filepath,sep="\t",header=T,check.names=F,stringsAsFactors = FALSE)
# load in the run names and the GSM IDs under the cell ranger results folder
temp.filenames<-list.files(cellranger.dir)
sample.rep<-temp.filenames[substr(temp.filenames,1,3)=="GSM"]   # aggregated samples
sample.uni<-temp.filenames[substr(temp.filenames,1,3)!="GSM"]   # single-run samples
my.run.table<-run.table[as.matrix(run.table)[,"Sample Name"]%in%sample.rep | as.matrix(run.table)[,"Run"]%in%sample.uni,]
# extract the nUMI matrix
dirnames<-list.files(cellranger.dir)
dirnames<-dirnames[substr(dirnames,1,4)=="SRR1"]
for(i in seq_along(dirnames)){
  run.name<-dirnames[i]
  # assumes each run maps to exactly one Sample Name -- TODO confirm
  sample.name<-run.table[run.table[,"Run"]==run.name,"Sample Name"]
  matrix_dir<-file.path(cellranger.dir,dirnames[i],"outs","filtered_feature_bc_matrix")
  barcode.path <- file.path(matrix_dir, "barcodes.tsv.gz")
  features.path <- file.path(matrix_dir, "features.tsv.gz")
  matrix.path <- file.path(matrix_dir, "matrix.mtx.gz")
  mat <- readMM(file = matrix.path)
  feature.names = read.delim(features.path, header = FALSE, stringsAsFactors = FALSE)
  barcode.names = read.delim(barcode.path, header = FALSE, stringsAsFactors = FALSE)
  colnames(mat) = paste0(sample.name,"_",barcode.names$V1)
  # NOTE(review): rownames use features V1 here, while the grace-side script
  # used V2 (gene symbols) -- confirm this difference is intentional
  rownames(mat) = feature.names$V1
  output.filepath<-file.path("/home/yanxiting/driver_Farnam/scratch60/SARC_10X/cellranger_results",paste0(sample.name,"_",run.name,"_nUMI.rds"))
  saveRDS(mat,file=output.filepath,refhook=NULL)
}
Since we have limited storage on the hpc, we back up the fastq.gz files and the cell ranger output folders. We extracted the nUMI matrix of each sample separately and deleted all other files to make room. All fastq.gz files and their corresponding cell ranger output folders are backed up on google drive under /Volumes/GoogleDrive/My Drive/GraceBackup/GRADS/SARC_PBMC/SARC_10X.
We load the data into Seurat for downstream analysis, especially for the integrated analysis.
First, we create the Seurat object and preliminary filtering on genes and
# Create the Seurat object from the merged nUMI matrix and attach the
# mitochondrial-percentage QC metric.
# due to the unavailability of the cluster, we ran these codes on tac4 server with the same versions of R and packages
# srun --pty --x11 -p pi_kaminski -t 12:00:00 --ntasks=1 --nodes=1 --cpus-per-task=1 --mem=60152 bash
# module restore seurat
#home.dir<-"/home/yanxiting/driver_Farnam"
home.dir<-"/home/yanxiting/Documents/Research/GRADS_SARC_PBMC"
source("/home/yanxiting/Rprogram/my_functions.R")
#output.dir<-file.path(home.dir,"scratch60/SARC_10X/Seurat")
output.dir<-file.path(home.dir,"scRNA-seq/Seurat")
# create the output folder on first use
if(file.exists(output.dir)==F){
dir.create(output.dir)
}
# Load the merged data
merged.data<-readRDS(file.path(home.dir,"Data/merged_nUMI_45_genenames_Read10X.rds"),refhook=NULL)
#geoff.data <- Read10X(data.dir = file.path(data.dir,"outs","filtered_gene_bc_matrices_mex","GRCh38"))
#load(file.path(home.dir,"scratch_kaminski/public/Backup/Jonas/R_objects/10x_ChuppAsthma.mtx.hybrid_gene_symbols_wo_background_03_1119.Robj"))
# first filtering step: keep genes seen in >=3 cells and cells with >=200 features
gcql1 <- CreateSeuratObject(counts = merged.data, project = "SARC_10X",min.cells = 3, min.features = 200)
# percent of each cell's counts coming from mitochondrial (MT-) genes
gcql1[["percent.mt"]] <- PercentageFeatureSet(gcql1, pattern = "^MT-")
cat("\n")cat("Originally, there are ",nrow(merged.data)," genes and ",ncol(merged.data)," cells in the data.\n",sep="")Originally, there are 36601 genes and 115826 cells in the data.
cat("After the first step filtering, there are ",nrow(gcql1@assays$RNA@data)," genes and ",ncol(gcql1@assays$RNA@data)," cells in the filtered data.\n",sep="")After the first step filtering, there are 23851 genes and 73145 cells in the filtered data.
# free the raw merged matrix now that it lives inside the Seurat object
rm(merged.data)
#gc(verbose=F)
invisible(gc())
# load in the phenotype data of all the samples
id.filepath<-file.path(home.dir,"Data/scRNA_ids.xlsx")
runtable.filepath<-file.path(home.dir,"Data/SraRunTable.txt")
run.table<-read.table(runtable.filepath,sep="\t",header=T,check.names = F)
id.table<-read.xlsx(id.filepath,sheetIndex = 1,check.names=F)
# get the sample names for all the cells
# (cell names are "<sample>_<barcode>", so the first "_" element is the sample)
my.ident<-unname(sapply(colnames(gcql1@assays$RNA@counts),my.element.extract,splitchar="_",index=1))
names(my.ident)<-colnames(gcql1@assays$RNA@counts)
# map sample name -> study_classification (disease label) from the run table
temp<-split(as.matrix(run.table)[,"study_classification"],as.matrix(run.table)[,"Sample Name"])
temp<-lapply(temp,unique)
my.disease<-unlist(temp[my.ident])
my.samplenames<-my.ident
# per-cell "<disease>_<sample>" label, used later for grouped QC plots
disease.samplenames<-paste(my.disease,"_",my.samplenames,sep="")
# scRNA-seq QC metric.
#mito.genes1 <- grep(pattern = "^MT-", x = rownames(gcql1@assays$RNA@counts), value = TRUE)
#percent.mito1 <- Matrix::colSums(gcql1@assays$RNA@counts[mito.genes1, ])/Matrix::colSums(gcql1@assays$RNA@counts)
# AddMetaData adds columns to object@meta.data, and is a great place to
# stash QC stats
gcql1 <- AddMetaData(object = gcql1, metadata = my.disease, col.name = "disease")
gcql1 <- AddMetaData(object = gcql1, metadata = my.samplenames, col.name = "sample.names")
gcql1 <- AddMetaData(object = gcql1, metadata = my.ident, col.name = "sample")
gcql1 <- AddMetaData(object = gcql1, metadata = disease.samplenames, col.name = "disease.samplenames")
# constant tissue label: every cell in this dataset is tagged "pbmc"
temp<-rep("pbmc",nrow(gcql1@meta.data))
gcql1 <- AddMetaData(object = gcql1, metadata = temp, col.name = "tissue")
# filter the data to only keep genes present (>0 in >=1% of all the cells) in >=3 subjects
sample.names.unique<-unique(gcql1@meta.data$sample.names)
# preallocated logical matrix: genes x samples (was grown with cbind)
gene.pres<-matrix(FALSE,nrow=nrow(gcql1@assays$RNA@counts),ncol=length(sample.names.unique))
for(i in seq_along(sample.names.unique)){
  temp.matrix<-gcql1@assays$RNA@counts[,gcql1@meta.data$sample.names==sample.names.unique[i]]
  # rowSums keeps the counts matrix sparse; apply() would densify it in memory
  temp.vect<-rowSums(temp.matrix>0)
  gene.pres[,i]<-temp.vect>=(ncol(temp.matrix)*0.01)
}
gene.names<-rownames(gcql1@assays$RNA@counts)[rowSums(gene.pres)>=3]
gcql1.orig<-gcql1
gcql1<-subset(gcql1.orig,features=gene.names)
cat("After the first and second steps of filtering, there are ",nrow(gcql1@assays$RNA@counts)," genes and ",ncol(gcql1@assays$RNA@counts)," cells in the data.\n",sep="")After the first and second steps of filtering, there are 12834 genes and 73145 cells in the data.
Second, we had a third filtering step on cells based on the nGene and percent.mito. Here are the plots to help us decide the threshold of the filtering.
# Violin plots of nGene, nUMI and %mito per disease+sample, used to choose
# the cell-filtering thresholds below.
#g1<-VlnPlot(object = gcql1, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3,group.by="samplenames",combine=T)
g2<-VlnPlot(object = gcql1, features= c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3,group.by = "disease.samplenames",combine=T)
grid.arrange(g2,ncol=1)# only keep cells with nFeature_RNA>=300 and percent.mt<=50 (matches the subset() call below)
# third filtering step: drop cells with <300 detected genes or >50% mito counts;
# the upper nFeature_RNA bound tried in the commented line was dropped
#gcql1<-subset(gcql1,subset= nFeature_RNA >= 300 & nFeature_RNA<= 6000 & percent.mt<= 50)
gcql1<-subset(gcql1,subset= nFeature_RNA >= 300 & percent.mt<= 50)
cat("After the third filtering step, we have ",nrow(gcql1@assays$RNA@counts)," genes and ",ncol(gcql1@assays$RNA@counts)," cells.\n",sep="")After the third filtering step, we have 12834 genes and 70244 cells.
Based on the quality control plots, in the third filtering step, we removed cells with >50% mitochondrial percentage or <300 expressed genes. After this third filtering step, we have 12834 genes and 70244 cells remaining.
Third, we normalize the unimputed and save the SAVER imputed data as the normalized data. These data are saved for downstream further processing.
# normalize the data and save the results
gcql1 <- NormalizeData(gcql1, normalization.method = "LogNormalize", scale.factor = 10000)Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
# save the normalized (unimputed) object for downstream processing
saveRDS(gcql1, file = file.path(output.dir,"1_normalized_data_unimputed_1.rds"))
#gcql1.saver<-NormalizeData(object = gcql1.saver, normalization.method = "LogNormalize", scale.factor = 10000)
#saveRDS(gcql1.saver, file = file.path(output.dir,"preprocessed_data_saverimputed_1.rds"))
cat("the preprocessed data for the unimputed data was saved as ",file.path(output.dir,"1_normalized_data_unimputed_1.rds"),"\n",sep="")the preprocessed data for the unimputed data was saved as /home/yanxiting/Documents/Research/GRADS_SARC_PBMC/scRNA-seq/Seurat/1_normalized_data_unimputed_1.rds
cat("After the normalization, there are ",nrow(gcql1@assays$RNA@data)," genes and ",ncol(gcql1@assays$RNA@data)," cells\n",sep="")After the normalization, there are 12834 genes and 70244 cells
#cat("the preprocessed data for the SAVER imputed data was saved as ",file.path(output.dir,"preprocessed_data_saverimputed_1.rds"),"\n",sep="")First, we find variable genes.
features.n<-2000
home.dir<-"/home/yanxiting/Documents/Research/GRADS_SARC_PBMC"We load in the normalized unimputed data and find the top 2000 variable genes for cell clustering. The normalized data was further scaled to remove the effect of nUMI and percent.mito for further cell clustering purpose.
# load in the normalized unimputed data saved in the previous step
data.filepath<-file.path(home.dir,"scRNA-seq/Seurat/1_normalized_data_unimputed_1.rds")
output.dir<-file.path(home.dir,"scRNA-seq/Seurat")
# !dir.exists() instead of file.exists(...)==F: directory-specific and avoids
# the reassignable F shorthand
if(!dir.exists(output.dir)){
  dir.create(output.dir)
}
gcql1<-readRDS(file = data.filepath, refhook = NULL)
gcql1<-FindVariableFeatures(gcql1,selection.method="vst",nfeatures=features.n)Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(gcql1), 10)
# plot variable features with and without labels (top-10 genes annotated)
plot1 <- VariableFeaturePlot(gcql1)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)When using repel, set xnudge and ynudge to 0 for optimal results
CombinePlots(plots = list(plot1, plot2))Then we scale the data to remove the effect from nUMI and percent.mito. Here we decide to scale for all genes so that later on, the heatmap will be generated using the scaled data.
#maybe regress cell cycle http://satijalab.org/seurat/cell_cycle_vignette.html
# all.genes computed so scaling could cover every gene (see commented call);
# the regression of nCount_RNA/percent.mt was ultimately not applied
all.genes<-rownames(gcql1)
#gcql1 <- ScaleData(gcql1,features = all.genes, vars.to.regress = c("nCount_RNA", "percent.mt"))
gcql1 <- ScaleData(gcql1)#output.filepath<-file.path(output.dir,"2_scaled_allgenes_unimputed.rds")
#saveRDS(gcql1, file = output.filepath)We perform the principal component analysis on the variable genes to decide the number of PCs to use for clustering.
gcql1 <- RunPCA(object = gcql1, features = VariableFeatures(gcql1), npcs=200,verbose=F)To see if there are PCs that are subject specific, the 2D PCA plots using the top 3 PCs are as follows for the un-imputed data.
# 2D PCA plots of the top 3 PC pairs, to check for subject-specific PCs
#par(mfrow=c(2,2))
fig.1<-DimPlot(object = gcql1, dims = c(1,2), pt.size=0.5)
fig.2<-DimPlot(object = gcql1, dims = c(2,3), pt.size=0.5)
fig.3<-DimPlot(object = gcql1, dims = c(1,3), pt.size=0.5)
grid.arrange(fig.1,fig.2,fig.3,ncol=1)We draw the heatmap of the top genes for each PC.
DimHeatmap(gcql1, dims = 1:10, cells = 500, balanced = TRUE)DimHeatmap(gcql1, dims = 11:20, cells = 500, balanced = TRUE)DimHeatmap(gcql1, dims = 21:30, cells = 500, balanced = TRUE)DimHeatmap(gcql1, dims = 31:40, cells = 500, balanced = TRUE)DimHeatmap(gcql1, dims = 41:50, cells = 500, balanced = TRUE)DimHeatmap(gcql1, dims = 51:60, cells = 500, balanced = TRUE)DimHeatmap(gcql1, dims = 61:70, cells = 500, balanced = TRUE)DimHeatmap(gcql1, dims = 71:80, cells = 500, balanced = TRUE)We use the following Jackstraw plots and the Elbowplot to decide the number of PCs to use for downstream analysis.
# JackStraw permutation test over all 200 PCs, 100 replicates (very slow: days)
gcql1 <- JackStraw(object = gcql1, num.replicate = 100, verbose = TRUE,dims=200)
| | 0 % ~calculating
|+ | 1 % ~02d 04h 52m 05s
|+ | 2 % ~02d 05h 28m 37s
|++ | 3 % ~02d 05h 09m 39s
|++ | 4 % ~02d 05h 35m 14s
|+++ | 5 % ~02d 05h 43m 55s
|+++ | 6 % ~02d 05h 13m 21s
|++++ | 7 % ~02d 06h 11m 00s
|++++ | 8 % ~02d 05h 23m 53s
|+++++ | 9 % ~02d 04h 30m 49s
|+++++ | 10% ~02d 04h 11m 26s
|++++++ | 11% ~02d 03h 29m 17s
|++++++ | 12% ~02d 02h 42m 11s
|+++++++ | 13% ~02d 02h 24m 55s
|+++++++ | 14% ~02d 01h 41m 47s
|++++++++ | 15% ~02d 01h 25m 26s
|++++++++ | 16% ~02d 00h 54m 42s
|+++++++++ | 17% ~02d 00h 03m 36s
|+++++++++ | 18% ~01d 23h 29m 41s
|++++++++++ | 19% ~01d 22h 45m 33s
|++++++++++ | 20% ~01d 21h 50m 30s
|+++++++++++ | 21% ~01d 21h 14m 04s
|+++++++++++ | 22% ~01d 20h 34m 58s
|++++++++++++ | 23% ~01d 20h 08m 32s
|++++++++++++ | 24% ~01d 19h 32m 28s
|+++++++++++++ | 25% ~01d 18h 53m 49s
|+++++++++++++ | 26% ~01d 18h 14m 19s
|++++++++++++++ | 27% ~01d 17h 39m 34s
|++++++++++++++ | 28% ~01d 17h 11m 30s
|+++++++++++++++ | 29% ~01d 16h 46m 55s
|+++++++++++++++ | 30% ~01d 16h 16m 34s
|++++++++++++++++ | 31% ~01d 15h 35m 32s
|++++++++++++++++ | 32% ~01d 14h 56m 52s
|+++++++++++++++++ | 33% ~01d 14h 25m 56s
|+++++++++++++++++ | 34% ~01d 13h 51m 05s
|++++++++++++++++++ | 35% ~01d 13h 17m 15s
|++++++++++++++++++ | 36% ~01d 12h 39m 13s
|+++++++++++++++++++ | 37% ~01d 12h 00m 52s
|+++++++++++++++++++ | 38% ~01d 11h 35m 20s
|++++++++++++++++++++ | 39% ~01d 11h 01m 13s
|++++++++++++++++++++ | 40% ~01d 10h 23m 26s
|+++++++++++++++++++++ | 41% ~01d 09h 47m 04s
|+++++++++++++++++++++ | 42% ~01d 09h 13m 18s
|++++++++++++++++++++++ | 43% ~01d 08h 37m 22s
|++++++++++++++++++++++ | 44% ~01d 08h 05m 25s
|+++++++++++++++++++++++ | 45% ~01d 07h 30m 49s
|+++++++++++++++++++++++ | 46% ~01d 06h 52m 44s
|++++++++++++++++++++++++ | 47% ~01d 06h 15m 28s
|++++++++++++++++++++++++ | 48% ~01d 05h 41m 15s
|+++++++++++++++++++++++++ | 49% ~01d 05h 06m 05s
|+++++++++++++++++++++++++ | 50% ~01d 04h 36m 22s
|++++++++++++++++++++++++++ | 51% ~01d 04h 02m 26s
|++++++++++++++++++++++++++ | 52% ~01d 03h 26m 60s
|+++++++++++++++++++++++++++ | 53% ~01d 02h 49m 39s
|+++++++++++++++++++++++++++ | 54% ~01d 02h 15m 55s
|++++++++++++++++++++++++++++ | 55% ~01d 01h 41m 50s
|++++++++++++++++++++++++++++ | 56% ~01d 01h 07m 07s
|+++++++++++++++++++++++++++++ | 57% ~01d 00h 32m 10s
|+++++++++++++++++++++++++++++ | 58% ~23h 56m 19s
|++++++++++++++++++++++++++++++ | 59% ~23h 20m 40s
|++++++++++++++++++++++++++++++ | 60% ~22h 47m 28s
|+++++++++++++++++++++++++++++++ | 61% ~22h 14m 05s
|+++++++++++++++++++++++++++++++ | 62% ~21h 40m 42s
|++++++++++++++++++++++++++++++++ | 63% ~21h 06m 44s
|++++++++++++++++++++++++++++++++ | 64% ~20h 32m 04s
|+++++++++++++++++++++++++++++++++ | 65% ~19h 56m 27s
|+++++++++++++++++++++++++++++++++ | 66% ~19h 22m 30s
|++++++++++++++++++++++++++++++++++ | 67% ~18h 48m 31s
|++++++++++++++++++++++++++++++++++ | 68% ~18h 14m 06s
|+++++++++++++++++++++++++++++++++++ | 69% ~17h 38m 53s
|+++++++++++++++++++++++++++++++++++ | 70% ~17h 03m 54s
|++++++++++++++++++++++++++++++++++++ | 71% ~16h 30m 03s
|++++++++++++++++++++++++++++++++++++ | 72% ~15h 55m 08s
|+++++++++++++++++++++++++++++++++++++ | 73% ~15h 21m 54s
|+++++++++++++++++++++++++++++++++++++ | 74% ~14h 48m 03s
|++++++++++++++++++++++++++++++++++++++ | 75% ~14h 14m 20s
|++++++++++++++++++++++++++++++++++++++ | 76% ~13h 39m 42s
|+++++++++++++++++++++++++++++++++++++++ | 77% ~13h 06m 04s
|+++++++++++++++++++++++++++++++++++++++ | 78% ~12h 32m 35s
|++++++++++++++++++++++++++++++++++++++++ | 79% ~11h 58m 06s
|++++++++++++++++++++++++++++++++++++++++ | 80% ~11h 23m 56s
|+++++++++++++++++++++++++++++++++++++++++ | 81% ~10h 49m 43s
|+++++++++++++++++++++++++++++++++++++++++ | 82% ~10h 14m 45s
|++++++++++++++++++++++++++++++++++++++++++ | 83% ~09h 41m 07s
|++++++++++++++++++++++++++++++++++++++++++ | 84% ~09h 06m 34s
|+++++++++++++++++++++++++++++++++++++++++++ | 85% ~08h 32m 37s
|+++++++++++++++++++++++++++++++++++++++++++ | 86% ~07h 57m 47s
|++++++++++++++++++++++++++++++++++++++++++++ | 87% ~07h 23m 31s
|++++++++++++++++++++++++++++++++++++++++++++ | 88% ~06h 49m 05s
|+++++++++++++++++++++++++++++++++++++++++++++ | 89% ~06h 14m 29s
|+++++++++++++++++++++++++++++++++++++++++++++ | 90% ~05h 40m 14s
|++++++++++++++++++++++++++++++++++++++++++++++ | 91% ~05h 06m 20s
|++++++++++++++++++++++++++++++++++++++++++++++ | 92% ~04h 32m 23s
|+++++++++++++++++++++++++++++++++++++++++++++++ | 93% ~03h 58m 25s
|+++++++++++++++++++++++++++++++++++++++++++++++ | 94% ~03h 24m 18s
|++++++++++++++++++++++++++++++++++++++++++++++++ | 95% ~02h 50m 07s
|++++++++++++++++++++++++++++++++++++++++++++++++ | 96% ~02h 15m 56s
|+++++++++++++++++++++++++++++++++++++++++++++++++ | 97% ~01h 42m 00s
|+++++++++++++++++++++++++++++++++++++++++++++++++ | 98% ~01h 07m 59s
|++++++++++++++++++++++++++++++++++++++++++++++++++| 99% ~33m 59s
|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=02d 08h 37m 33s
| | 0 % ~calculating
|+ | 1 % ~06s
|+ | 2 % ~06s
|++ | 3 % ~11s
|++ | 4 % ~09s
|+++ | 5 % ~08s
|+++ | 6 % ~07s
|++++ | 7 % ~08s
|++++ | 8 % ~07s
|+++++ | 9 % ~07s
|+++++ | 10% ~07s
|++++++ | 11% ~06s
|++++++ | 12% ~06s
|+++++++ | 13% ~06s
|+++++++ | 14% ~06s
|++++++++ | 15% ~06s
|++++++++ | 16% ~06s
|+++++++++ | 17% ~05s
|+++++++++ | 18% ~05s
|++++++++++ | 19% ~05s
|++++++++++ | 20% ~05s
|+++++++++++ | 21% ~05s
|+++++++++++ | 22% ~05s
|++++++++++++ | 23% ~05s
|++++++++++++ | 24% ~05s
|+++++++++++++ | 25% ~05s
|+++++++++++++ | 26% ~05s
|++++++++++++++ | 27% ~05s
|++++++++++++++ | 28% ~04s
|+++++++++++++++ | 29% ~04s
|+++++++++++++++ | 30% ~04s
|++++++++++++++++ | 31% ~04s
|++++++++++++++++ | 32% ~04s
|+++++++++++++++++ | 33% ~04s
|+++++++++++++++++ | 34% ~04s
|++++++++++++++++++ | 35% ~04s
|++++++++++++++++++ | 36% ~04s
|+++++++++++++++++++ | 37% ~04s
|+++++++++++++++++++ | 38% ~04s
|++++++++++++++++++++ | 39% ~04s
|++++++++++++++++++++ | 40% ~04s
|+++++++++++++++++++++ | 41% ~04s
|+++++++++++++++++++++ | 42% ~03s
|++++++++++++++++++++++ | 43% ~03s
|++++++++++++++++++++++ | 44% ~03s
|+++++++++++++++++++++++ | 45% ~03s
|+++++++++++++++++++++++ | 46% ~03s
|++++++++++++++++++++++++ | 47% ~03s
|++++++++++++++++++++++++ | 48% ~03s
|+++++++++++++++++++++++++ | 49% ~03s
|+++++++++++++++++++++++++ | 50% ~03s
|++++++++++++++++++++++++++ | 51% ~03s
|++++++++++++++++++++++++++ | 52% ~03s
|+++++++++++++++++++++++++++ | 53% ~03s
|+++++++++++++++++++++++++++ | 54% ~03s
|++++++++++++++++++++++++++++ | 55% ~03s
|++++++++++++++++++++++++++++ | 56% ~03s
|+++++++++++++++++++++++++++++ | 57% ~03s
|+++++++++++++++++++++++++++++ | 58% ~03s
|++++++++++++++++++++++++++++++ | 59% ~02s
|++++++++++++++++++++++++++++++ | 60% ~02s
|+++++++++++++++++++++++++++++++ | 61% ~02s
|+++++++++++++++++++++++++++++++ | 62% ~02s
|++++++++++++++++++++++++++++++++ | 63% ~02s
|++++++++++++++++++++++++++++++++ | 64% ~02s
|+++++++++++++++++++++++++++++++++ | 65% ~02s
|+++++++++++++++++++++++++++++++++ | 66% ~02s
|++++++++++++++++++++++++++++++++++ | 67% ~02s
|++++++++++++++++++++++++++++++++++ | 68% ~02s
|+++++++++++++++++++++++++++++++++++ | 69% ~02s
|+++++++++++++++++++++++++++++++++++ | 70% ~02s
|++++++++++++++++++++++++++++++++++++ | 71% ~02s
|++++++++++++++++++++++++++++++++++++ | 72% ~02s
|+++++++++++++++++++++++++++++++++++++ | 73% ~02s
|+++++++++++++++++++++++++++++++++++++ | 74% ~02s
|++++++++++++++++++++++++++++++++++++++ | 75% ~01s
|++++++++++++++++++++++++++++++++++++++ | 76% ~01s
|+++++++++++++++++++++++++++++++++++++++ | 77% ~01s
|+++++++++++++++++++++++++++++++++++++++ | 78% ~01s
|++++++++++++++++++++++++++++++++++++++++ | 79% ~01s
|++++++++++++++++++++++++++++++++++++++++ | 80% ~01s
|+++++++++++++++++++++++++++++++++++++++++ | 81% ~01s
|+++++++++++++++++++++++++++++++++++++++++ | 82% ~01s
|++++++++++++++++++++++++++++++++++++++++++ | 83% ~01s
|++++++++++++++++++++++++++++++++++++++++++ | 84% ~01s
|+++++++++++++++++++++++++++++++++++++++++++ | 85% ~01s
|+++++++++++++++++++++++++++++++++++++++++++ | 86% ~01s
|++++++++++++++++++++++++++++++++++++++++++++ | 87% ~01s
|++++++++++++++++++++++++++++++++++++++++++++ | 88% ~01s
|+++++++++++++++++++++++++++++++++++++++++++++ | 89% ~01s
|+++++++++++++++++++++++++++++++++++++++++++++ | 90% ~01s
|++++++++++++++++++++++++++++++++++++++++++++++ | 91% ~01s
|++++++++++++++++++++++++++++++++++++++++++++++ | 92% ~00s
|+++++++++++++++++++++++++++++++++++++++++++++++ | 93% ~00s
|+++++++++++++++++++++++++++++++++++++++++++++++ | 94% ~00s
|++++++++++++++++++++++++++++++++++++++++++++++++ | 95% ~00s
|++++++++++++++++++++++++++++++++++++++++++++++++ | 96% ~00s
|+++++++++++++++++++++++++++++++++++++++++++++++++ | 97% ~00s
|+++++++++++++++++++++++++++++++++++++++++++++++++ | 98% ~00s
|++++++++++++++++++++++++++++++++++++++++++++++++++| 99% ~00s
|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=06s
gcql1 <- ScoreJackStraw(gcql1, dims = 1:200)JackStrawPlot(gcql1, dims = 1:100)ElbowPlot(object = gcql1,ndims=100)pc.num<-46Based on the JackStraw plot and the Elbow plot, we decided to use the top 46 PCs for the unimputed data.
# save the object with PCA/JackStraw results for the clustering step
saveRDS(gcql1, file = file.path(output.dir,paste("2_dimreduction_data_",length(gcql1@assays$RNA@var.features),"vargenes_pipeline1.rds",sep="")))
# we save the file as preprocessed_data_saver.rds to prevent any mistakes but we change the file name back to preprocessed_data.rds before performing downstream analysis.
cat("We save the R project as\n")We save the R project as
cat("unimputed data:\t",file.path(output.dir,paste("2_dimreduction_data_",length(gcql1@assays$RNA@var.features),"vargenes_pipeline1.rds",sep="")),"\n",sep="")unimputed data: /home/yanxiting/Documents/Research/GRADS_SARC_PBMC/scRNA-seq/Seurat/2_dimreduction_data_2000vargenes_pipeline1.rds
Based on the Jackstraw plots, we decided to use the 46 top PCs for clustering.
To identify potential outlying cells and samples, we first label the tSNE plot using disease_samplenames.
sample1<-FindNeighbors(gcql1,dims=1:pc.num)Computing nearest neighbor graph
Computing SNN
sample1<-FindClusters(sample1,resolution=1.2)Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 70244
Number of edges: 2398867
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8762
Number of communities: 40
Elapsed time: 23 seconds
sample1<-RunUMAP(sample1,dims=1:pc.num)22:23:44 UMAP embedding parameters a = 0.9922 b = 1.112
22:23:44 Read 70244 rows and found 46 numeric columns
22:23:44 Using Annoy for neighbor search, n_neighbors = 30
22:23:44 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
22:23:59 Writing NN index file to temp file /tmp/RtmpGbMf9o/file31239293ae5
22:23:59 Searching Annoy index using 1 thread, search_k = 3000
22:24:35 Annoy recall = 100%
22:24:37 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
22:24:42 Initializing from normalized Laplacian + noise (using irlba)
22:24:49 Commencing optimization for 200 epochs, with 3281762 positive edges
Using method 'umap'
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
22:26:36 Optimization finished
# Plot the UMAP three ways: by cluster, by disease+sample label, and by disease,
# to screen for outlying cells/samples. (umap-learn variant kept commented out.)
#sample1<-RunUMAP(sample1,dims=1:pc.num,umap.method="umap-learn")fig1<-DimPlot(sample1,reduction="umap")
fig2<-DimPlot(sample1,reduction="umap",group.by="disease.samplenames")
fig3<-DimPlot(sample1,reduction="umap",group.by="disease")
# Stack the three panels; then reset pc.num and repeat the neighbor/cluster/UMAP
# pipeline with the new PC count.
grid.arrange(fig1,fig2,fig3,ncol=1)pc.num<-46
# Regenerate the embeddings and color the tSNE/UMAP plots by sample.
sample1<-FindNeighbors(gcql1,dims=1:pc.num)sample1<-FindClusters(sample1,resolution=1.2)sample1<-RunUMAP(sample1,dims=1:pc.num)#sample1<-RunUMAP(sample1,dims=1:pc.num,umap.method="umap-learn")
fig1<-DimPlot(sample1,reduction="umap")
fig2<-DimPlot(sample1,reduction="umap",group.by="disease.samplenames")
fig3<-DimPlot(sample1,reduction="umap",group.by="disease")
grid.arrange(fig1,fig2,fig3,ncol=1)pc.num<-35
# Add the tSNE embedding, then persist the object with all embeddings attached.
sample1 <- RunTSNE(sample1, dims= 1:pc.num)saveRDS(sample1, file = file.path(output.dir,paste("2_visualization_data_",length(gcql1@assays$RNA@var.features),"vargenes_pipeline1.rds",sep="")))
cat("We saved the seurat object with UMAP and tSNE as ",file.path(output.dir,paste("2_visualization_data_",length(sample1@assays$RNA@var.features),"vargenes_pipeline1.rds",sep="")),"\n",sep="")We saved the seurat object with UMAP and tSNE as /home/yanxiting/Documents/Research/GRADS_SARC_PBMC/scRNA-seq/Seurat/2_visualization_data_2000vargenes_pipeline1.rds
my.distinct.colors<-c('#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff', '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080', '#ffffff', '#000000')To describe the cell types captured in the data, the integrated analysis may provide cleaner results. Therefore, in this note we conduct an integrated analysis of all the asthma 10X data using Seurat V3.1 to reduce the subject effect in, at least, the data visualization and the trajectory analysis.
########################################################
# 1. Reload the Seurat object saved by the visualization step
#    (pipeline 1, 2000 variable genes, UMAP/tSNE attached).
data.dir <- file.path(home.dir, "scRNA-seq/Seurat")
sample1 <- readRDS(file.path(data.dir, "2_visualization_data_2000vargenes_pipeline1.rds"))
sample1.raw<-sample1Split the whole dataset based on subject ID (fresh and DMSO samples from the same subject are considered as one sample). Find anchors to integrate all the datasets together.
# Split by subject, then re-run per-sample log-normalization and
# variable-feature selection (vst, 2000 genes), as required before
# FindIntegrationAnchors.
sample1.list<-SplitObject(sample1.raw,split.by="sample.names")
sample1.list<-lapply(X=sample1.list,FUN=function(x){
x<-NormalizeData(x)
x<-FindVariableFeatures(x,selection.method="vst",nfeatures=2000)
})Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
# Per-sample cell counts, used to exclude samples too small for anchoring.
cells.per.sample <- unlist(lapply(sample1.list, ncol))
# Raise the future package's globals limit (~4 GB) so the large Seurat
# objects can be exported to workers without erroring.
options(future.globals.maxSize = 4194304000)
# Earlier variant used a threshold/k.filter of 92:
# sample1.anchors <- FindIntegrationAnchors(sample1.list[cells.per.sample >= 92], k.filter = 92, dims = 1:40, verbose = FALSE)
# Keep only samples with >= 40 cells; k.filter must not exceed the
# smallest retained sample.
sample1.anchors <- FindIntegrationAnchors(sample1.list[cells.per.sample >= 40],
                                          k.filter = 40, dims = 1:40,
                                          verbose = FALSE)
| | 0 % ~calculating
Warning in irlba(A = mat3, nv = num.cc) :
You're computing too large a percentage of total singular values, use a standard svd instead.
Warning in irlba(A = mat3, nv = num.cc) :
You're computing too large a percentage of total singular values, use a standard svd instead.
Warning in irlba(A = mat3, nv = num.cc) :
You're computing too large a percentage of total singular values, use a standard svd instead.
|+ | 1 % ~06h 43m 19s
Warning in irlba(A = mat3, nv = num.cc) :
You're computing too large a percentage of total singular values, use a standard svd instead.
Warning in irlba(A = mat3, nv = num.cc) :
You're computing too large a percentage of total singular values, use a standard svd instead.
|++ | 2 % ~04h 32m 28s
Warning in irlba(A = mat3, nv = num.cc) :
You're computing too large a percentage of total singular values, use a standard svd instead.
|++ | 3 % ~03h 50m 49s
Warning in irlba(A = mat3, nv = num.cc) :
You're computing too large a percentage of total singular values, use a standard svd instead.
|+++ | 4 % ~03h 20m 10s
Warning in irlba(A = mat3, nv = num.cc) :
You're computing too large a percentage of total singular values, use a standard svd instead.
|+++ | 6 % ~02h 58m 02s
Warning in irlba(A = mat3, nv = num.cc) :
You're computing too large a percentage of total singular values, use a standard svd instead.
Warning in irlba(A = mat3, nv = num.cc) :
You're computing too large a percentage of total singular values, use a standard svd instead.
Warning in irlba(A = mat3, nv = num.cc) :
You're computing too large a percentage of total singular values, use a standard svd instead.
Warning in irlba(A = mat3, nv = num.cc) :
You're computing too large a percentage of total singular values, use a standard svd instead.
Warning in irlba(A = mat3, nv = num.cc) :
You're computing too large a percentage of total singular values, use a standard svd instead.
Warning in irlba(A = mat3, nv = num.cc) :
You're computing too large a percentage of total singular values, use a standard svd instead.
|++++ | 7 % ~02h 35m 25s
Warning in irlba(A = mat3, nv = num.cc) :
You're computing too large a percentage of total singular values, use a standard svd instead.
Warning in irlba(A = mat3, nv = num.cc) :
You're computing too large a percentage of total singular values, use a standard svd instead.
Warning in irlba(A = mat3, nv = num.cc) :
You're computing too large a percentage of total singular values, use a standard svd instead.
Warning in irlba(A = mat3, nv = num.cc) :
You're computing too large a percentage of total singular values, use a standard svd instead.
|++++ | 8 % ~02h 57m 22s
Warning in irlba(A = mat3, nv = num.cc) :
You're computing too large a percentage of total singular values, use a standard svd instead.
|+++++ | 9 % ~03h 02m 08s
Warning in irlba(A = mat3, nv = num.cc) :
You're computing too large a percentage of total singular values, use a standard svd instead.
|+++++ | 10% ~02h 50m 25s
Warning in irlba(A = mat3, nv = num.cc) :
You're computing too large a percentage of total singular values, use a standard svd instead.
Warning in irlba(A = mat3, nv = num.cc) :
You're computing too large a percentage of total singular values, use a standard svd instead.
|++++++ | 11% ~02h 56m 59s
Warning in irlba(A = mat3, nv = num.cc) :
You're computing too large a percentage of total singular values, use a standard svd instead.
|+++++++ | 12% ~02h 51m 05s
Warning in irlba(A = mat3, nv = num.cc) :
You're computing too large a percentage of total singular values, use a standard svd instead.
|+++++++ | 13% ~03h 01m 15s
Warning in irlba(A = mat3, nv = num.cc) :
You're computing too large a percentage of total singular values, use a standard svd instead.
|++++++++ | 14% ~02h 59m 06s
Warning in irlba(A = mat3, nv = num.cc) :
You're computing too large a percentage of total singular values, use a standard svd instead.
|++++++++ | 16% ~03h 00m 55s
Warning in irlba(A = mat3, nv = num.cc) :
You're computing too large a percentage of total singular values, use a standard svd instead.
|+++++++++ | 17% ~02h 57m 15s
Warning in irlba(A = mat3, nv = num.cc) :
You're computing too large a percentage of total singular values, use a standard svd instead.
|+++++++++ | 18% ~02h 58m 28s
Warning in irlba(A = mat3, nv = num.cc) :
You're computing too large a percentage of total singular values, use a standard svd instead.
|++++++++++ | 19% ~02h 54m 60s
Warning in irlba(A = mat3, nv = num.cc) :
You're computing too large a percentage of total singular values, use a standard svd instead.
|++++++++++ | 20% ~02h 59m 36s
Cluster the cells using the integrated data.
We save the result in a rds file.
Split the whole dataset based on subject ID (fresh and DMSO samples from the same subject are considered as one sample). Find anchors to integrate all the datasets together.
# Partition cells by subject so each sample is processed independently,
# then redo log-normalization and vst variable-feature selection (2000 genes)
# within every sample before anchor finding.
sample1.list <- SplitObject(sample1.raw, split.by = "sample.names")
sample1.list <- lapply(sample1.list, function(obj) {
  obj <- NormalizeData(obj)
  FindVariableFeatures(obj, selection.method = "vst", nfeatures = 2000)
})
# Count cells per sample; tiny samples cannot support anchor filtering.
n.cells <- unlist(lapply(sample1.list, ncol))
# Allow ~4 GB of globals to be shipped to future workers.
options(future.globals.maxSize = 4194304000)
# Previous run used 92 as both the size cutoff and k.filter:
# sample1.anchors <- FindIntegrationAnchors(sample1.list[n.cells >= 92], k.filter = 92, dims = 1:40, verbose = FALSE)
sample1.anchors <- FindIntegrationAnchors(sample1.list[n.cells >= 40],
                                          k.filter = 40,
                                          dims = 1:40,
                                          verbose = FALSE)
#sample1.combined<-IntegrateData(anchorset=sample1.anchors,features.to.integrate=rownames(sample1.raw@assays$RNA@counts),dims=1:40,verbose=FALSE)
sample1.combined<-IntegrateData(anchorset=sample1.anchors,dims=1:40,verbose=FALSE)dir.create(file.path(output.dir,"integrated_analysis"))
# Save the integrated object and the pre-integration object as .rds files.
output.filepath <- file.path(output.dir, "integrated_analysis", "seurat_object_integrated_allcells.rds")
saveRDS(sample1.combined, file = output.filepath)
cat("We saved the integrated data for all cells as ", output.filepath, "\n", sep = "")
output.filepath <- file.path(output.dir, "integrated_analysis", "seurat_object_original_allcells.rds")
# NOTE(review): this saves `sample1`, while the integration above started from
# `sample1.raw` -- confirm which object is the intended "original" data.
saveRDS(sample1, file = output.filepath)
cat("We saved the original data for all cells as ", output.filepath, "\n", sep = "")
# Next: cluster the cells using the integrated data.
# Standard Seurat workflow on the integrated assay: scale, PCA (40 PCs),
# UMAP, shared-nearest-neighbor graph, then graph-based clustering.
DefaultAssay(sample1.combined) <- "integrated"
sample1.combined <- ScaleData(sample1.combined, verbose = FALSE)
sample1.combined <- RunPCA(sample1.combined, npcs = 40, verbose = FALSE)
sample1.combined <- RunUMAP(sample1.combined, reduction = "pca", dims = 1:40)
sample1.combined <- FindNeighbors(sample1.combined, reduction = "pca", dims = 1:40)
sample1.combined <- FindClusters(sample1.combined, resolution = 1.2)
# Next: save the result in a rds file.
# Save the clustered (but not yet cell-typed) integrated object.
output.subdir <- file.path(output.dir, "integrated_analysis")
# Test for the directory with dir.exists(); the original used
# file.exists(...) == F (F is reassignable, and file.exists is for files).
if (!dir.exists(output.subdir)) {
  dir.create(output.subdir)
}
output.filepath <- file.path(output.subdir, "seurat_object_integrated_allcells_cellclustering_noSingleR.rds")
saveRDS(sample1.combined, file = output.filepath)
cat("We saved the seurat object of the integrated data with cell clustering results only as ", output.filepath, "\n", sep = "")
# NOTE(review): this library() call was fused onto the previous line in the
# original export; cowplot provides plot_grid() used by the plots below.
library(cowplot)
# UMAP overview of the integrated data: by sample, by cluster, and by disease.
p1 <- DimPlot(sample1.combined, reduction = "umap", group.by = "sample.names")
p2 <- DimPlot(sample1.combined, reduction = "umap", label = TRUE)
#p3<-DimPlot(sample1.combined, reduction = "umap", group.by="singler.hpca.cluster.merged", label = TRUE,cols=my.distinct.colors)
#p4<-DimPlot(sample1.combined, reduction = "umap", group.by="singler.hpca.cluster", label = TRUE,cols=my.distinct.colors)
p5 <- DimPlot(sample1.combined, reduction = "umap", group.by = "disease", label = TRUE, cols = my.distinct.colors)
#plot_grid(p2, p1, p3,p4,p5,ncol=1)
plot_grid(p2, p1, p5, ncol = 1)
We first load in the annotated PBMC 10x data from Amy.
We then load in the integrated seurat object of our data.
# Load our clustered integrated object, then merge its raw counts with the
# annotated PBMC reference object (amy.10x, loaded earlier) so the two
# datasets can be integrated and annotated together.
data.filepath <- "/home/yanxiting/Documents/Research/GRADS_SARC_PBMC/scRNA-seq/Seurat/integrated_analysis/seurat_object_integrated_allcells_cellclustering_noSingleR.rds"
sample1.combined <- readRDS(data.filepath, refhook = NULL)
# Merge the raw count matrices column-wise (cells from both sources).
merged.data <- cbind(sample1.combined@assays$RNA@counts, amy.10x@assays$RNA@counts)
# NOTE(review): the original left `merged.metadata` and the `meta.data`
# argument unfinished (syntax errors). Reconstructed here by stacking the
# per-cell metadata on the columns shared by both objects, in the same cell
# order as the merged count matrix -- TODO confirm this matches the intent.
common.cols <- intersect(colnames(sample1.combined@meta.data), colnames(amy.10x@meta.data))
merged.metadata <- rbind(sample1.combined@meta.data[, common.cols, drop = FALSE],
                         amy.10x@meta.data[, common.cols, drop = FALSE])
merge.data <- CreateSeuratObject(counts = merged.data, project = "MergedSeuratObject",
                                 assay = "RNA", meta.data = merged.metadata)
# Second integration run: split by subject ID rather than by sample, then
# normalize and find variable features per dataset before anchoring.
sample1.list <- SplitObject(sample1.raw, split.by = "subject.names")
sample1.list <- lapply(X = sample1.list, FUN = function(x) {
  x <- NormalizeData(x)
  x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
})
# Cells per dataset; k.filter = 92 requires every kept dataset to have >= 92 cells.
temp.cellnum <- unlist(lapply(sample1.list, ncol))
# Increase the global maxSize so future-based parallelism can export large objects.
options(future.globals.maxSize = 4194304000)
sample1.anchors <- FindIntegrationAnchors(sample1.list[temp.cellnum >= 92], k.filter = 92, dims = 1:40, verbose = FALSE)
#sample1.combined<-IntegrateData(anchorset=sample1.anchors,features.to.integrate=rownames(sample1.raw@assays$RNA@counts),dims=1:40,verbose=FALSE)
sample1.combined <- IntegrateData(anchorset = sample1.anchors, dims = 1:40, verbose = FALSE)
# NOTE(review): this assignment was fused onto the previous line in the
# original export; it begins the save chunk below.
output.filepath <- file.path(output.dir, "integrated_analysis", "seurat_object_integrated_allcells.rds")
# Save the integrated object and the pre-integration object as .rds files.
saveRDS(sample1.combined, file = output.filepath)
cat("We saved the integrated data for all cells as ", output.filepath, "\n", sep = "")
output.filepath <- file.path(output.dir, "integrated_analysis", "seurat_object_original_allcells.rds")
# NOTE(review): this saves `sample1`, not `sample1.raw` -- confirm which
# object is the intended "original" data.
saveRDS(sample1, file = output.filepath)
cat("We saved the original data for all cells as ", output.filepath, "\n", sep = "")
# Next: cluster the cells using the integrated data.
# Standard Seurat workflow on the integrated assay: scale, PCA (40 PCs),
# UMAP, shared-nearest-neighbor graph, then graph-based clustering.
DefaultAssay(sample1.combined) <- "integrated"
sample1.combined <- ScaleData(sample1.combined, verbose = FALSE)
sample1.combined <- RunPCA(sample1.combined, npcs = 40, verbose = FALSE)
sample1.combined <- RunUMAP(sample1.combined, reduction = "pca", dims = 1:40)
sample1.combined <- FindNeighbors(sample1.combined, reduction = "pca", dims = 1:40)
sample1.combined <- FindClusters(sample1.combined, resolution = 1.2)
# Next: save the result in a rds file.
# Save the clustered (but not yet cell-typed) integrated object.
output.subdir <- file.path(output.dir, "integrated_analysis")
# Test for the directory with dir.exists(); the original used
# file.exists(...) == F (F is reassignable, and file.exists is for files).
if (!dir.exists(output.subdir)) {
  dir.create(output.subdir)
}
output.filepath <- file.path(output.subdir, "seurat_object_integrated_allcells_cellclustering_noSingleR.rds")
saveRDS(sample1.combined, file = output.filepath)
cat("We saved the seurat object of the integrated data with cell clustering results only as ", output.filepath, "\n", sep = "")
# NOTE(review): this plot was fused onto the cat() line in the original
# export; it begins the visualization chunk below.
p1 <- DimPlot(sample1.combined, reduction = "umap", group.by = "subject.names")
# UMAP overview: clusters, subjects, both SingleR annotations, and disease.
p2 <- DimPlot(sample1.combined, reduction = "umap", label = TRUE)
p3 <- DimPlot(sample1.combined, reduction = "umap", group.by = "singler.hpca.cluster.merged", label = TRUE, cols = my.distinct.colors)
p4 <- DimPlot(sample1.combined, reduction = "umap", group.by = "singler.hpca.cluster", label = TRUE, cols = my.distinct.colors)
p5 <- DimPlot(sample1.combined, reduction = "umap", group.by = "disease", label = TRUE, cols = my.distinct.colors)
plot_grid(p2, p1, p3, p4, p5, ncol = 1)
# Next: visualize the integrated data split by cluster.
# For each cluster, produce a pair of panels: the full UMAP colored by
# cluster identity, and the same UMAP with only that cluster in red.
fig.list <- list()
celltype.names <- as.numeric(unique(as.character(Idents(sample1.combined))))
celltype.names <- sort(celltype.names, decreasing = FALSE)
for (i in seq_along(celltype.names)) {
  temp.map <- as.character(Idents(sample1.combined))
  # Use <- for assignment (the original used = here).
  temp.map[temp.map != celltype.names[i]] <- "other"
  sample1.combined@meta.data$temp.map <- temp.map
  fig.list[[2 * (i - 1) + 1]] <- DimPlot(sample1.combined, reduction = "umap", label = FALSE, group.by = "ident")
  #fig.list[[2*i]]<-DimPlot(sample1.combined, reduction = "umap", label = FALSE,group.by="temp.map",cols=c(my.distinct.colors[i],"grey"))
  fig.list[[2 * i]] <- DimPlot(sample1.combined, reduction = "umap", label = FALSE, group.by = "temp.map", cols = c("red", "grey"))
}
do.call(grid.arrange, c(fig.list, ncol = 2))
# Next: split the view by the old SingleR cell-type annotation.
# Highlight each old SingleR cell type on the UMAP; all other cells are
# recoded to "zother" so they sort after the target label and plot in grey.
fig.list <- list()
celltype.names <- unique(as.character(sample1.combined@meta.data$singler.hpca.cluster))
celltype.names <- sort(celltype.names, decreasing = TRUE)
for (i in seq_along(celltype.names)) {
  temp.map <- as.character(sample1.combined@meta.data$singler.hpca.cluster)
  temp.map[temp.map != celltype.names[i]] <- "zother"
  sample1.combined@meta.data$temp.map <- temp.map
  fig.list[[i]] <- DimPlot(sample1.combined, reduction = "umap", group.by = "temp.map", label = FALSE, pt.size = 0.5, cols = c(my.distinct.colors[i], "grey"))
}
do.call(grid.arrange, c(fig.list, ncol = 2))
# Next: rerun SingleR on the clusters from the integrated data.
# Run SingleR (HPCA reference, fine labels) at the cluster level: rebuild a
# bare Seurat object from raw counts, convert to SingleCellExperiment,
# log-normalize, then annotate each integrated cluster.
library(SingleR)
library(scRNAseq)
library(scater)
hpca.se <- HumanPrimaryCellAtlasData()
hpca.se
#hESCs <- LaMannoBrainData('human-es')
my.temp <- CreateSeuratObject(counts = sample1.combined@assays$RNA@counts, project = "ChuppSputum", min.cells = 0, min.features = 0)
temp <- as.SingleCellExperiment(my.temp, assay = "RNA")
temp <- logNormCounts(temp)
sample1.singler.hpca <- SingleR(test = temp, ref = hpca.se, labels = hpca.se$label.fine, method = "cluster", clusters = as.character(Idents(sample1.combined)))
plotScoreHeatmap(sample1.singler.hpca)
# Next: data visualization labelled by the new SingleR results.
# Translate each cell's cluster id to its pruned SingleR (fine) label and
# store the result as per-cell metadata.
cluster.names <- rownames(sample1.singler.hpca)
cluster.labels <- sample1.singler.hpca$pruned.labels
temp.map <- as.character(Idents(sample1.combined))
temp.map <- plyr::mapvalues(x = temp.map, from = cluster.names, to = cluster.labels)
sample1.combined <- AddMetaData(sample1.combined, metadata = temp.map, col.name = "singler.hpca.cluster.combined")
# Next: get the main cell type labels and save the annotated object.
# Repeat SingleR at the cluster level with the coarser main labels, map the
# labels onto cells, then save the fully annotated object.
my.temp <- CreateSeuratObject(counts = sample1.combined@assays$RNA@counts, project = "ChuppSputum", min.cells = 0, min.features = 0)
temp <- as.SingleCellExperiment(my.temp, assay = "RNA")
temp <- logNormCounts(temp)
sample1.singler.hpca <- SingleR(test = temp, ref = hpca.se, labels = hpca.se$label.main, method = "cluster", clusters = as.character(Idents(sample1.combined)))
plotScoreHeatmap(sample1.singler.hpca)
# NOTE(review): the following assignment was fused onto the previous line in
# the original export.
cluster.names <- rownames(sample1.singler.hpca)
cluster.labels <- sample1.singler.hpca$pruned.labels
temp.map <- as.character(Idents(sample1.combined))
temp.map <- plyr::mapvalues(x = temp.map, from = cluster.names, to = cluster.labels)
sample1.combined <- AddMetaData(sample1.combined, metadata = temp.map, col.name = "singler.hpca.cluster.main.combined")
output.filepath <- file.path(output.subdir, "seurat_object_integrated_allcells_cellclustering_singler_hpca.rds")
saveRDS(sample1.combined, file = output.filepath)
# Next: visualize the SingleR cell typing results on the combined UMAP.
# Highlight each new SingleR cell type (combined annotation) on the UMAP;
# all other cells are recoded to "zother" so they sort last and plot in grey.
fig.list <- list()
celltype.names <- unique(as.character(sample1.combined@meta.data$singler.hpca.cluster.combined))
celltype.names <- sort(celltype.names, decreasing = TRUE)
for (i in seq_along(celltype.names)) {
  temp.map <- as.character(sample1.combined@meta.data$singler.hpca.cluster.combined)
  temp.map[temp.map != celltype.names[i]] <- "zother"
  sample1.combined@meta.data$temp.map <- temp.map
  fig.list[[i]] <- DimPlot(sample1.combined, reduction = "umap", group.by = "temp.map", label = FALSE, pt.size = 0.5, cols = c(my.distinct.colors[i], "grey"))
}
do.call(grid.arrange, c(fig.list, ncol = 2))